** PSID education data cleaning for The Effects of Changes in Local Bank Health on Household Consumption
** by Daniel Cooper and Joe Peek, ReStat 2020
/* ////////////////////////////////////////////////////////////////////////// */
clear all
set more off

* this code cleans the PSID education variables; it requires output from psid_agesex_samehead.do

* the education data for some PSID members jump around in ways that do not seem plausible and this
* code attempts to fix them--the PSID variable codes for the raw PSID data that we use are listed below

* there are a couple of education variables in the PSID and we try to reconcile and combine data from the 
* two where possible and relevant. 

* update with relevant file path
* (the paths below are quoted so that a root directory containing spaces still works)
local psid "~/PSID_new"
cd "`psid'/Education"

* load the raw long-format education file and declare the panel structure
* (panel variable: unique, time variable: year)
use "educraw_hmr_long.dta", clear
xtset unique year

*COMPLETED ED-HD (Values 1-17)
*local educh1 [75]V4093 [76]V4684 [77]V5608 [78]V6157 [79]V6754 [80]V7387 [81]V8039 [82]V8663 [83]V9349 [84]V10996 [91]V20198 [92]V21504 [93]V23333 [94]ER4158 [95]ER6998 [96]ER9249 [97]ER12222 [99]ER16516 [01]ER20457 [03]ER24148 [05]ER28047 [07]ER41037 [09]ER46981 [11]ER52405 [13]ER58223 [15]ER65459 [17]ER71538
*COMPLETED ED-WF (1-17)
*local educw1 [75]V4102 [76]V4695 [77]V5567 [78]V6116 [79]V6713 [80]V7346 [81]V7998 [82]V8622 [83]V9308 [84]V10955 [91]V20199 [92]V21505 [93]V23334 [94]ER4159 [95]ER6999 [96]ER9250 [97]ER12223 [99]ER16517 [01]ER20458 [03]ER24149 [05]ER28048 [07]ER41038 [09]ER46982 [11]ER52406 [13]ER58224 [15]ER65460 [17]ER71539

* Same treatment for head (h) and wife (w): code 99 becomes missing and
* the top code 17 gets a value label.
foreach who in h w {
	replace educ`who'1 = . if educ`who'1 == 99
	label define educ1`who' 17 "At least some postgraduate work"
	label values educ`who'1 educ1`who'
}
label variable educh1 "How many grades of school did you (HEAD) finish?"
label variable educw1 "How many grades of school did your Wife finish?"

*EDUCATION_HEAD (Values 1-9)
*educh2 [68]V313 [69]V794 [70]V1485 [71]V2197 [72]V2823 [73]V3241 [74]V3663 [75]V4198 [76]V5074 [77]V5647 [78]V6194 [79]V6787 [80]V7433 [81]V8085 [82]V8709 [83]V9395 [84]V11042 [85]V12400 [86]V13640 [87]V14687 [88]V16161 [89]V17545 [90]V18898

* One-shot recode of the bracketed head education variable:
*   9 -> missing
*   0 "could not read or write" is folded into 1 "0-5 grades or mentions could not read or write"
*   the raw variable splits high school graduates into two categories (with and without
*   "non-academic training"); educ1 has no equivalent, so 5 is merged into 4 and the higher
*   categories are shifted down by one so that the codes stay consecutive integers
recode educh2 (9 = .) (0 = 1) (5 = 4) (6 = 5) (7 = 6) (8 = 7)

label define educ2h ///
	1 "0 - 5 grades" ///
	2 "6 - 8 grades" ///
	3 "9 - 11 grades" ///
	4 "12 grades (completed high school)" ///
	5 "College, no degree" ///
	6 "College, bachelors degree (A.B., B.S., etc.)" ///
	7 "College, advanced or professional degrees (M.A., Ph.D., LLB, BD, M.S., etc."
label values educh2 educ2h
label variable educh2 "How many grades of school did you (HEAD) finish?"

*EDUCATION-WIFE (1-9)  /notice 1969-1971 are missing, so they are imputed below/
*educw2 [68]V246 [72]V2687 [73]V3216 [74]V3638 [75]V4199 [76]V5075 [77]V5648 [78]V6195 [79]V6788 [80]V7434 [81]V8086 [82]V8710 [83]V9396 [84]V11043 [85]V12401 [86]V13641 [87]V14688 [88]V16162 [89]V17546 [90]V18899

* One-shot recode of the bracketed wife education variable:
*   9 -> missing
*   0 -> missing (for the wife variable zero means there is no wife)
*   the raw variable splits high school graduates into two categories (with and without
*   "non-academic training"); educ1 has no equivalent, so 5 is merged into 4 and the higher
*   categories are shifted down by one so that the codes stay consecutive integers
recode educw2 (9 = .) (0 = .) (5 = 4) (6 = 5) (7 = 6) (8 = 7)

label define educ2w ///
	1 "0 - 5 grades" ///
	2 "6 - 8 grades, grade school; DK but mentions could read or write" ///
	3 "9 - 11 grades (some high school)" ///
	4 "12 grades (completed high school)" ///
	5 "Some college, no degree" ///
	6 "College, bachelors degree" ///
	7 "College, advanced or professional degree; some graduate work; close to receiving degree"
label values educw2 educ2w
label variable educw2 "How many grades of school did your wife finish?"

* LAST KNOWN MARITAL STATUS
*local mstatus [68]V239 [69]V607 [70]V1365 [71]V2072 [72]V2670 [73]V3181 [74]V3598 [75]V4053 [76]V4603 [77]V5650 [78]V6197 [79]V6790 [80]V7435 [81]V8087 [82]V8711 [83]V9419 [84]V11065 [85]V12426 [86]V13665 [87]V14712 [88]V16187 [89]V17565 [90]V18916 [91]V20216 [92]V21522 [93]V23336 [94]ER4159A [95]ER6999A [96]ER9250A [97]ER12223A [99]ER16423 [01]ER20369 [03]ER24150 [05]ER28049 [07]ER41039 [09]ER46983 [11]ER52407 [13]ER58225 [15]ER65461 [17]ER71540

* In 1968 only, code 8 is moved to 6: 1968 is the only year for the category "Married, Spouse Absent"
recode mstatus (8 = 6) if year == 1968

label define mstat ///
	1 "Married" ///
	2 "Single" ///
	3 "Widowed" ///
	4 "Divorced" ///
	5 "Separated" ///
	6 "Married, Spouse absent" ///
	8 "DK" ///
	9 "NA"
label values mstatus mstat
label variable mstatus "Marriage Status"


////////////////////////////////////////////////////////////////////////////////
*merge in head/wife age/sex data from individual data as well as corrected samehead/samewife data.
*this will also limit the sample to only heads and wives. all others kicked out.
*(the second merge uses keep(3), so only observations found in both files survive)

* need age data to determine whether "jumps" in the education variable make sense since after a certain age
* large changes in education are not very plausible. 

* need samehead/wife variable to check whether head/wife has changed making education change feasible

* file names are quoted so that a `psid' root containing spaces does not break the merges

*created in *psid_agesex_samehead.do*
* keep(1 3): keep every education record, matched or not; only samehead and newwife are brought in
merge 1:1 unique year using "`psid'/samehead_fix_upd", keepusing(samehead newwife) keep(1 3) nogen

*merge in age and sex variable corrected
* the ageh/agew already in memory are dropped first so that the versions from the using file are the ones kept
drop ageh agew

*created in *psid_agesex_samehead.do*
merge 1:1 unique year using "`psid'/psid_age_sex_correct_upd", keep(3) nogen

xtset unique year



* FIRST fix age data similar to what we do in the age/earnings code (see it for more details)
* basically correcting deviations in age that do not make sense; figure out majority age path for individual
* and assign other ages to be consistent with that path if they deviate. 

*AGE FIXES --  
*instances where age will jump up for one period and then back down to its expected value. consider typo and fix.
*after fix these typos, see if ages are on the same 'age path'. whichever path is dominant for the person, revise all other ages to match that path
*then fix if age repeats more than twice in a row.
*not perfect - makes some big assumptions. 

*note that age can deviate by 1 year in either direction from its expected value due to when in the year the person was interviewed. 
	*ex: interviewed july 1990, age 26. turn 27 in october 1990. turn 28 in october 1991. interviewed again december 1991. so jumped from 26 to 28 from 1990 to 1991.
	*can also happen where age does not change between two years. 
*in last step smooth over these to clean up noise 

* NOTE: ereplace, tsspell and carryforward are user-written commands (not part of official Stata)
* and must be installed before this loop can run.
				
foreach v in ageh agew {

	* 999 is treated as a missing-age code
	replace `v' = . if  `v' == 999

	*creating birth year for first observation
	sort unique year
	by unique: g birthyear = year[1] -  `v'[1]
	by unique: replace birthyear = year[2] -  `v'[2] if  `v'[2] != . &  `v'[1] == .  
	by unique: replace birthyear = year[3] -  `v'[3] if  `v'[3] != . &  `v'[2] == . &  `v'[1] == . 
		*stop getting any changes after 3 
		*making sure get a birth year even if first few age observations are missing 

	*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
	bysort unique: ereplace birthyear = min(birthyear)

	*create age path based on birthyear
	g age_path = year - birthyear 

	*count how often households are off of their path -- if first obs wrong, then entire path wrong
	* (a missing age also counts as off the path, because "missing != number" evaluates to true)
	g off_path = 1 if  `v' != age_path &  `v' != age_path + 1 &  `v' != age_path - 1
	by unique: egen all_off_path = sum(off_path)

	*count what fraction of observations are off the path 
	bys unique: egen obs = count(unique)
	g off_path_frac = all_off_path / obs 

	*create a second age path for people whose off_path fraction is greater than 1/2 
	sort unique year
	by unique: g birthyear2 = year[2] -  `v'[2] if off_path_frac > .5
	by unique: replace birthyear2 = year[3] -  `v'[3] if  `v'[3] != . &  `v'[2] == . &  `v'[1] == .  & off_path_frac > .5
	*first obs can be a typo

	*assign minimum birthyear that exists in each unique. sometimes vary because age doesn't change in two years due to interview month 
	bysort unique: ereplace birthyear2 = min(birthyear2)

	*create age path based on birthyear
	g age_path2 = year - birthyear2

	*count when age does not equal age path 
	g age_bad = 1 if (`v' != age_path &  `v' != age_path + 1 &  `v' != age_path - 1) & off_path_frac <= .5
	replace age_bad = 1 if (`v' != age_path2 &  `v' != age_path2 + 1 &  `v' != age_path2 - 1) & off_path_frac > .5

	*replace age with age path if doesn't equal age path for just one period
	*this also gets if the first or last obs is off the path but the subsequent/preceding is not 
	* (year) is given explicitly so the [_n-1]/[_n+1] neighbours are guaranteed to be the adjacent survey years
	bys unique (year): replace `v' = age_path if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac <= .5
	bys unique (year): replace `v' = age_path2 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac > .5
		
	bys unique (year): replace age_bad = 0 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac <= .5
	bys unique (year): replace age_bad = 0 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac > .5

	*how big is the deviation? 
	g age_path_dev = abs(`v' - age_path) if off_path_frac <= .5
	replace age_path_dev = abs(`v' - age_path2) if off_path_frac > .5

	*if have deviations that are 5 or more, want to smooth back to the age path 
	*generally it seemingly switches from one path to another. so see which path is majority
	*and then follow that path 

	*indicate 'spells' of age_path deviations (within 1)
		*even if the age path is 1-2-1, want to group the 1s together. so create a new xtset index. 
	sort unique age_path_dev
	g index = _n 
	xtset unique index
	tsspell age_path_dev, cond(age_path_dev[_n-1] == age_path_dev + 1 | age_path_dev[_n-1] == age_path_dev | age_path_dev[_n-1] == age_path_dev - 1) 

	by unique: replace _spell = _spell[_n+1] if _spell == 0 	
		*spell will be zero on first obs of a new spell because can't compare to _n-1
		
	xtset unique year 

	*generate a variable for new spell so that can modify
	* NOTE(review): this comparison and the carryforward below are not run "by unique", so they can
	* reach across panel boundaries; every statistic computed afterwards is grouped by unique and spell2.
	g new_spell = 1 if _spell != _spell[_n-1]

	*create new spell if age decreases 
	by unique: replace new_spell = 1 if `v' < `v'[_n-1]

	*index spells 
	egen spell2 = group(unique new_spell year)
	carryforward spell2, replace

	*see which spell is the majority
	bys unique spell2: egen count_dev = count(year)
	bys unique spell2: g frac_dev = count_dev / obs

	xtset unique year 

	by unique: egen max_spell = max(frac_dev)
	by unique: g maj_spell = 1 if max_spell == frac_dev

	*create age path based on the majority spell
	by unique: g birthyear3 = year - `v' if maj_spell == 1 
	by unique: ereplace birthyear3 = min(birthyear3)
	g age_path3 = year - birthyear3 

	*reassign age to follow the majority age path 
		*vast majority of observations: will just be the one age path because always follows and already fixed obvious typos 
	replace `v' = age_path3 if maj_spell != 1 

	*if split evenly then defer to original age path -- maj_spell will be one
	*this will indicate if there are multiple spells with the same frac_dev and thus both max_spells
	* spell2 is referred to by its full name so that this does not depend on variable abbreviation
	by unique: egen max_spell1 = max(spell2) if frac_dev == max_spell
	by unique: egen max_spell2 = min(spell2) if frac_dev == max_spell

	replace `v' = age_path if max_spell1 != max_spell2 & frac_dev != 1

	*see if age remains the same for more than 2 periods -- the above method doesn't catch these because age_path_dev just increases by 1 consistently 
	by unique: g age_same = 1 if `v' == `v'[_n-1] & `v' == `v'[_n-2] & `v' != . & `v'[_n-1] != . & `v'[_n-2] != .
	* spread the flag back over the earlier observations of the run; done by unique so that a run at the
	* start of one panel cannot flag the last observations of the previous panel
	by unique: replace age_same = 1 if age_same[_n+1] == 1
	by unique: replace age_same = 1 if age_same[_n+2] == 1

	*apply age path to these
	* the two alternative path conditions are parenthesised together so that age_same == 1 applies to both
	* ("&" binds more tightly than "|"). The final enforcement step below applies the same path choice to
	* every observation, so it supersedes this step either way.
	replace `v' = age_path if age_same == 1 & ((off_path_frac <= .5 & frac_dev == 1) | (max_spell1 != max_spell2 & frac_dev != 1))
	replace `v' = age_path2 if age_same == 1 & off_path_frac > .5 & frac_dev == 1
	replace `v' = age_path3 if age_same == 1 & frac_dev != 1 & max_spell1 == max_spell2


	*now enforce the relevant age path. gets rid of age changing by 0 or 2 year to year because of interview month relative to birthday.
	*which introduces unnecessary noise. 
	*this also will fix the few remaining situations where age is bouncing around the relevant age path with deviations like 1-1-2-2-2-3-3
	replace `v' = age_path if ((off_path_frac <= .5 & frac_dev == 1) | (max_spell1 != max_spell2 & frac_dev != 1)) 
	replace `v' = age_path2 if off_path_frac > .5 & frac_dev == 1 
	replace `v' = age_path3 if frac_dev != 1 & max_spell1 == max_spell2
	
	* remove all helper variables so the next pass of the loop can create them again
	* (note that _* removes every variable whose name starts with an underscore)
	drop birthyear* age_path* off_path all_off_path off_path_frac obs maj_spell* max_spell* _* spell2  frac_dev new_spell age_same age_bad index count_dev

}

////////////////////////////////////////////////////////////////////////////////

* Code education data consistently and look at discrepancies across measures


*Put educj1 (grades 1-17) into the 7 educj2 categories, for head (h) and wife (w).
*The tests are nested from the highest category down; any value matching none of them
*(including missing) is left missing.
foreach j in h w {
	local grade educ`j'1
	gen e1`j' = cond(`grade' == 17, 7, ///
	            cond(`grade' == 16, 6, ///
	            cond(`grade' > 12 & `grade' < 16, 5, ///
	            cond(`grade' == 12, 4, ///
	            cond(`grade' > 8 & `grade' < 12, 3, ///
	            cond(`grade' > 5 & `grade' < 9, 2, ///
	            cond(`grade' < 6, 1, .)))))))
}

*Check out the conflicts
*Signed gap between the two measures in 1976-1990, skipping rows with samehead == 0 (head)
*or newwife == 1 (wife)
local same_h "samehead != 0"
local same_w "newwife != 1"
foreach j in h w {
	bysort unique: gen ediff`j' = educ`j'2 - e1`j' if inrange(year, 1976, 1990) & `same_`j''
}
*Size of the gap regardless of direction
foreach j in h w {
	bysort unique: gen absediff`j' = abs(ediff`j')
}



* EDUCATION COMBINED
*Integrate the two different education measures
//As of 3/1/18: For the 4% of observations for which educj1 and educj2 conflict (|ediff| > 0) I am trusting educj1 (aka e1j) over educj2. In the future you may want to consider dropping the 1% with |ediff|>1//
foreach j in h w {
	* Stata treats missing as larger than any number, so "absediff > 0" is also true when absediff
	* is missing. Without the !missing(e1) guard a valid educj2 would be overwritten with a missing
	* e1j in every 1976-1990 row where educj1 was not recorded. With the guard educj2 is only ever
	* replaced by an actual educj1-based value.
	replace educ`j'2 = e1`j' if absediff`j' > 0 & year > 1975 & year < 1991 & !missing(e1`j')

	* Collapse to 5 categories. Either measure can set the category; the replaces run from the
	* lowest to the highest category, so when the two measures differ the higher category wins.
	gen educ`j' = .
	replace educ`j' = 1 if e1`j' < 4 | educ`j'2 < 4
	replace educ`j' = 2 if e1`j' == 4 | educ`j'2 == 4
	replace educ`j' = 3 if e1`j' == 5 | educ`j'2 == 5
	replace educ`j' = 4 if e1`j' == 6 | educ`j'2 == 6
	replace educ`j' = 5 if e1`j' == 7 | educ`j'2 == 7

	label define ed`j' ///
		1 "Less than High School" ///
		2 "High School Graduate" ///
		3 "Some College, no degree" ///
		4 "College Graduate, bachelors degree" ///
		5 "At least some postgraduate work"
	label values educ`j' ed`j'
}
lab var educh "Education Head"
lab var educw "Education Wife"

// Data Cleaning //
* Every rule below compares an observation with its neighbours ([_n-1], [_n+1], [_n+2]) inside a
* person, so the data are sorted by year within unique explicitly rather than relying on whatever
* order earlier commands left behind. Rules for the head skip rows with samehead == 0 and rules
* for the wife skip rows with newwife == 1.

*Catch when educ goes up for just one period
bysort unique (year): replace educh = educh[_n-1] if educh[_n-1]< educh & educh > educh[_n+1] & educh > educh[_n+2] & educh !=. & educh[_n-1]!=. & educh[_n+1]!=. & educh[_n+2]!=. & samehead !=0
bysort unique (year): replace educw = educw[_n-1] if educw[_n-1]< educw & educw > educw[_n+1] & educw > educw[_n+2] & educw !=. & educw[_n-1]!=. & educw[_n+1]!=. & educw[_n+2]!=. & newwife !=1

*Then do not allow education to decrease, ever	(you won't un-graduate from high school)							 
bysort unique (year): replace educh = educh[_n-1] if educh < educh[_n-1] & educh!=. & educh[_n-1]!=. & samehead!=0
bysort unique (year): replace educw = educw[_n-1] if educw < educw[_n-1] & educw!=. & educw[_n-1]!=. & newwife!=1

* Don't allow education to increase if the person is over 40. 
     *(We did this because while in real life people go back to school, the psid is messy and there is a non-representative 
	 * share of people over 40 in the education increases.)
	 * A missing age compares as larger than 40 in Stata, so the age test also requires a
	 * non-missing age; otherwise the rule would fire for people whose age is unknown.
bysort unique (year): replace educh = educh[_n-1] if educh > educh[_n-1] & ageh >40 & !missing(ageh) & educh[_n-1]!=. & samehead!=0
bysort unique (year): replace educw = educw[_n-1] if educw > educw[_n-1] & agew >40 & !missing(agew) & educw[_n-1]!=. & newwife!=1

*Drop wife education for single heads
replace educw = . if mstatus == 2

*Fill in values if education missing but education level was previously recorded
* (replace works through the observations in order, so a filled value is carried on to later missing years)
bysort unique (year): replace educh = educh[_n-1] if educh ==. & educh[_n-1]!=. &samehead!=0
bysort unique (year): replace educw = educw[_n-1] if educw ==. & educw[_n-1]!=. &newwife!=1

*Survey design is such that education is only asked if new head/wife or in particular survey years, so fill
* data as needed. 
* In the years 1970-1984 and 1986-2007 any value that differs from the person's previous
* observation is replaced by that previous value, unless the head/wife is new
* (samehead == 0 / newwife == 1) or the previous value is missing. All other years are left alone.
*
* This is done in a single pass per variable: with the data sorted by year within unique, replace
* works through the observations in order and each [_n-1] already holds its final value, which is
* the same result as looping over the years one at a time.
bysort unique (year): replace educh = educh[_n-1] if educh != educh[_n-1] & educh[_n-1] != . & (inrange(year, 1970, 1984) | inrange(year, 1986, 2007)) & samehead!=0
bysort unique (year): replace educw = educw[_n-1] if educw != educw[_n-1] & educw[_n-1] != . & (inrange(year, 1970, 1984) | inrange(year, 1986, 2007)) & newwife !=1

*Impute values for 1969-71 educw, since they are missing

* each person's 1968 wife education, copied onto all of that person's rows
bysort unique: egen educw1968 = min(cond(year == 1968, educw, .))

* first year in which the person has newwife == 1 (missing if that never happens)
bysort unique: egen yrnewwife = min(cond(newwife == 1, year, .))

* use the 1968 value in 1969-1971 as long as no new wife has appeared yet; a missing
* yrnewwife compares as larger than any year, so people with no new wife are included
replace educw = educw1968 if inrange(year, 1969, 1971) & year < yrnewwife

xtset unique year

* remove any leftover variables whose names start with an underscore before saving
capture drop _*
save education_hmr_fin.dta, replace
